H&M Group Analysis

LOAD LIBRARIES¶

In [1]:
#IMPORTING LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

IMPORT DATASETS¶

In [2]:
#IMPORTING "PRODUCT" DATA FOR H&M

prod = pd.read_csv("products.csv")
prod.head(10)
Out[2]:
product_id product_code product_type_no product_type_name product_group_name department_no department_name index_code index_group_name graphical_appearance_no graphical_appearance colour_group_code colour_group_name section_no clothing_group_no
0 108775015 108775 253 Vest top Garment Upper body 1676 Jersey Basic A Ladieswear 1010016 Solid 9 Black 16 1002
1 108775044 108775 253 Vest top Garment Upper body 1676 Jersey Basic A Ladieswear 1010016 Solid 10 White 16 1002
2 108775051 108775 253 Vest top Garment Upper body 1676 Jersey Basic A Ladieswear 1010017 Stripe 11 Off White 16 1002
3 110065001 110065 306 Tank Tops Base Layers 1339 Lifestyle B Ladieswear 1010016 Solid 9 Black 61 1017
4 110065002 110065 306 Tank Tops Base Layers 1339 Lifestyle B Ladieswear 1010016 Solid 10 White 61 1017
5 110065011 110065 306 Tank Tops Base Layers 1339 Lifestyle B Ladieswear 1010016 Solid 12 Light Beige 61 1017
6 111565001 111565 304 Leggings Socks & Tights 3608 Tights basic B Ladieswear 1010016 Solid 9 Black 62 1021
7 111565003 111565 302 Socks Socks & Tights 3608 Tights basic B Ladieswear 1010016 Solid 13 Beige 62 1021
8 111586001 111586 273 Leggings/Tights Garment Lower body 3608 Tights basic B Ladieswear 1010016 Solid 9 Black 62 1021
9 111593001 111593 304 Leggings Socks & Tights 3608 Tights basic B Ladieswear 1010016 Solid 9 Black 62 1021
In [3]:
#IMPORTING "CUSTOMER" DATA FOR H&M

cust = pd.read_csv("customers.csv")
cust.head(10)
Out[3]:
customer_id FN Active club_member_status fashion_news_frequency age postal_code
0 00000dbacae5abe5e23885899a1fa44253a17956c6d1c3... 0.0 0.0 ACTIVE NONE 49.0 52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1 0000423b00ade91418cceaf3b26c6af3dd342b51fd051e... 0.0 0.0 ACTIVE NONE 25.0 2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... 0.0 0.0 ACTIVE NONE 24.0 64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3 00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2... 0.0 0.0 ACTIVE NONE 54.0 5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4 00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f... 1.0 1.0 ACTIVE Regularly 52.0 25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
5 000064249685c11552da43ef22a5030f35a147f723d5b0... 0.0 0.0 Not Active Biannually 34.0 2c29ae653a9282cce4151bd87643c907644e09541abc28...
6 0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d... 0.0 0.0 ACTIVE NONE 20.0 fe7b8e2b3fafb89ca90db17ffeeae0fd29b795d803f749...
7 00007d2de826758b65a93dd24ce629ed66842531df6699... 1.0 1.0 ACTIVE Regularly 32.0 8d6f45050876d059c830a0fe63f1a4c022de279bb68ce3...
8 00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77... 0.0 0.0 ACTIVE NONE 20.0 2c29ae653a9282cce4151bd87643c907644e09541abc28...
9 00008469a21b50b3d147c97135e25b4201a8c58997f787... 0.0 0.0 ACTIVE NONE 20.0 2c29ae653a9282cce4151bd87643c907644e09541abc28...
In [4]:
#IMPORTING "TRANSACTION" DATA FOR H&M

trans = pd.read_csv("transactions.csv")
trans.head(10)
Out[4]:
t_dat customer_id product_id price sales_channel_id
0 2018-09-20 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... 663713001 $101.66 2
1 2018-09-20 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... 541518023 $60.98 2
2 2018-09-20 00007d2de826758b65a93dd24ce629ed66842531df6699... 505221004 $30.47 2
3 2018-09-20 00007d2de826758b65a93dd24ce629ed66842531df6699... 685687003 $33.86 2
4 2018-09-20 00007d2de826758b65a93dd24ce629ed66842531df6699... 685687004 $33.86 2
5 2018-09-20 00007d2de826758b65a93dd24ce629ed66842531df6699... 685687001 $33.86 2
6 2018-09-20 00007d2de826758b65a93dd24ce629ed66842531df6699... 505221001 $40.64 2
7 2018-09-20 00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4... 688873012 $60.98 1
8 2018-09-20 00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4... 501323011 $106.75 1
9 2018-09-20 00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4... 598859003 $91.49 2

DATA OVERVIEW¶

In [5]:
#DATA INFORMATION FOR "PRODUCT"
prod.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105540 entries, 0 to 105539
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   product_id               105540 non-null  int64 
 1   product_code             105540 non-null  int64 
 2   product_type_no          105540 non-null  int64 
 3   product_type_name        105540 non-null  object
 4   product_group_name       105540 non-null  object
 5   department_no            105540 non-null  int64 
 6   department_name          105540 non-null  object
 7   index_code               105540 non-null  object
 8   index_group_name         105540 non-null  object
 9   graphical_appearance_no  105540 non-null  int64 
 10  graphical_appearance     105540 non-null  object
 11  colour_group_code        105540 non-null  int64 
 12  colour_group_name        105540 non-null  object
 13  section_no               105540 non-null  int64 
 14  clothing_group_no        105540 non-null  int64 
dtypes: int64(8), object(7)
memory usage: 12.1+ MB
In [6]:
#CHECKING COLUMNS IN "PRODUCT" DATA
prod.columns
Out[6]:
Index(['product_id', 'product_code', 'product_type_no', 'product_type_name',
       'product_group_name', 'department_no', 'department_name', 'index_code',
       'index_group_name', 'graphical_appearance_no', 'graphical_appearance',
       'colour_group_code', 'colour_group_name', 'section_no',
       'clothing_group_no'],
      dtype='object')
In [7]:
#CHECKING FOR MISSING VALUES IN "PRODUCT" DATA 
prod.isnull().sum()
Out[7]:
product_id                 0
product_code               0
product_type_no            0
product_type_name          0
product_group_name         0
department_no              0
department_name            0
index_code                 0
index_group_name           0
graphical_appearance_no    0
graphical_appearance       0
colour_group_code          0
colour_group_name          0
section_no                 0
clothing_group_no          0
dtype: int64
In [8]:
#SUMMARY STATS FOR "PRODUCT" DATA
prod.describe()
Out[8]:
product_id product_code product_type_no department_no graphical_appearance_no colour_group_code section_no clothing_group_no
count 1.055400e+05 105540.000000 105540.000000 105540.000000 1.055400e+05 1.055400e+05 105540.000000 105540.000000
mean 6.984208e+08 698420.812138 234.987360 4532.783836 1.010013e+06 3.001975e+02 42.664213 1010.438128
std 1.284606e+08 128460.651523 74.738532 2712.717364 6.694473e+00 1.644887e+04 23.260325 6.730984
min 1.087750e+08 108775.000000 49.000000 1201.000000 1.010001e+06 1.000000e+00 2.000000 1001.000000
25% 6.169878e+08 616987.750000 252.000000 1676.000000 1.010008e+06 9.000000e+00 20.000000 1005.000000
50% 7.022120e+08 702212.000000 259.000000 4222.000000 1.010016e+06 1.400000e+01 46.000000 1009.000000
75% 7.967008e+08 796700.750000 272.000000 7389.000000 1.010016e+06 5.200000e+01 61.000000 1017.000000
max 9.594610e+08 959461.000000 762.000000 9989.000000 1.010030e+06 1.010030e+06 97.000000 1025.000000
In [9]:
#GENERATING PANDAS DATAFRAME SUMMARY FOR "PRODUCT" DATA
def sniff_modified(df):
    with pd.option_context("display.max_colwidth", 20):
        info = pd.DataFrame()
        info['data type'] = df.dtypes
        info['percent missing'] = df.isnull().sum()*100/len(df)
        info['No. unique'] = df.apply(lambda x: len(x.unique()))
        info['unique values'] = df.apply(lambda x: x.unique())
        return info.sort_values('data type')

sniff_modified(prod)
Out[9]:
data type percent missing No. unique unique values
product_id int64 0.0 105540 [108775015, 108775044, 108775051, 110065001, 1...
product_code int64 0.0 47222 [108775, 110065, 111565, 111586, 111593, 11160...
product_type_no int64 0.0 131 [253, 306, 304, 302, 273, 252, 254, 272, 72, 5...
department_no int64 0.0 299 [1676, 1339, 3608, 6515, 1334, 5883, 2032, 434...
graphical_appearance_no int64 0.0 30 [1010016, 1010017, 1010001, 1010010, 1010019, ...
colour_group_code int64 0.0 50 [9, 10, 11, 12, 13, 7, 71, 6, 73, 8, 52, 43, 1...
section_no int64 0.0 57 [16, 61, 62, 44, 26, 8, 66, 22, 46, 51, 79, 31...
clothing_group_no int64 0.0 21 [1002, 1017, 1021, 1005, 1019, 1016, 1007, 102...
product_type_name object 0.0 148 [Vest top, Tank Tops, Leggings, Socks, Legging...
product_group_name object 0.0 21 [Garment Upper body, Base Layers, Socks & Tigh...
department_name object 0.0 203 [Jersey Basic, Lifestyle, Tights basic, Kids, ...
index_code object 0.0 10 [A, B, G, F, C, S, H, D, I, J]
index_group_name object 0.0 6 [Ladieswear, Baby/Children, Menswear, Sport, D...
graphical_appearance object 0.0 30 [Solid, Stripe, All over pattern, Melange, Tra...
colour_group_name object 0.0 50 [Black, White, Off White, Light Beige, Beige, ...
In [10]:
#CHECKING "PRODUCT" DATAFRAME SHAPE
prod.shape
Out[10]:
(105540, 15)
In [11]:
#CHECKING DUPLICATES VALUES IN "PRODUCT" DATA
prod.duplicated().any()
Out[11]:
False
In [12]:
#DATA INFORMATION FOR "CUSTOMER"
cust.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1048575 non-null  object 
 1   FN                      1048575 non-null  float64
 2   Active                  1048575 non-null  float64
 3   club_member_status      1048575 non-null  object 
 4   fashion_news_frequency  1048575 non-null  object 
 5   age                     1048575 non-null  float64
 6   postal_code             1048575 non-null  object 
dtypes: float64(3), object(4)
memory usage: 56.0+ MB
In [13]:
#CHECKING COLUMNS IN "CUSTOMER" DATA
cust.columns
Out[13]:
Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')
In [14]:
#CHECKING FOR MISSING VALUES IN "CUSTOMER" DATA 
cust.isnull().sum()
Out[14]:
customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64
In [15]:
#SUMMARY STATS FOR "CUSTOMER" DATA
cust.describe()
Out[15]:
FN Active age
count 1.048575e+06 1.048575e+06 1.048575e+06
mean 3.479923e-01 3.387798e-01 3.637593e+01
std 4.763338e-01 4.732951e-01 1.424735e+01
min 0.000000e+00 0.000000e+00 1.600000e+01
25% 0.000000e+00 0.000000e+00 2.400000e+01
50% 0.000000e+00 0.000000e+00 3.200000e+01
75% 1.000000e+00 1.000000e+00 4.900000e+01
max 1.000000e+00 1.000000e+00 9.900000e+01
In [16]:
#GENERATING PANDAS DATAFRAME SUMMARY FOR "CUSTOMER" DATA
def sniff_modified(df):
    with pd.option_context("display.max_colwidth", 20):
        info = pd.DataFrame()
        info['data type'] = df.dtypes
        info['percent missing'] = df.isnull().sum()*100/len(df)
        info['No. unique'] = df.apply(lambda x: len(x.unique()))
        info['unique values'] = df.apply(lambda x: x.unique())
        return info.sort_values('data type')

sniff_modified(cust)
Out[16]:
data type percent missing No. unique unique values
FN float64 0.0 2 [0.0, 1.0]
Active float64 0.0 2 [0.0, 1.0]
age float64 0.0 84 [49.0, 25.0, 24.0, 54.0, 52.0, 34.0, 20.0, 32....
customer_id object 0.0 1048575 [00000dbacae5abe5e23885899a1fa44253a17956c6d1c...
club_member_status object 0.0 4 [ACTIVE, Not Active, PRE-CREATE, LEFT CLUB]
fashion_news_frequency object 0.0 5 [NONE, Regularly, Biannually, Monthly, None]
postal_code object 0.0 326710 [52043ee2162cf5aa7ee79974281641c6f11a68d276429...
In [17]:
#CHECKING "CUSTOMER" DATAFRAME SHAPE
cust.shape
Out[17]:
(1048575, 7)
In [18]:
#CHECKING DUPLICATES VALUES IN "CUSTOMER" DATA
cust.duplicated().any()
Out[18]:
False
In [19]:
#DATA INFORMATION FOR "TRANSACTION"
trans.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   t_dat             1048575 non-null  object
 1   customer_id       1048575 non-null  object
 2   product_id        1048575 non-null  int64 
 3    price            1048575 non-null  object
 4   sales_channel_id  1048575 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 40.0+ MB
In [20]:
#CHECKING COLUMNS IN "TRANSACTION" DATA
trans.columns
Out[20]:
Index(['t_dat', 'customer_id', 'product_id', ' price ', 'sales_channel_id'], dtype='object')
In [21]:
#CHECKING FOR MISSING VALUES IN "TRANSACTION" DATA 
trans.isnull().sum()
Out[21]:
t_dat               0
customer_id         0
product_id          0
 price              0
sales_channel_id    0
dtype: int64
In [22]:
#SUMMARY STATS FOR "TRANSACTION" DATA
trans.describe()
Out[22]:
product_id sales_channel_id
count 1.048575e+06 1.048575e+06
mean 5.861657e+08 1.663561e+00
std 1.070634e+08 4.724914e-01
min 1.087750e+08 1.000000e+00
25% 5.544500e+08 1.000000e+00
50% 6.196550e+08 2.000000e+00
75% 6.605900e+08 2.000000e+00
max 7.402370e+08 2.000000e+00
In [23]:
#GENERATING PANDAS DATAFRAME SUMMARY FOR "TRANSACTION" DATA
def sniff_modified(df):
    with pd.option_context("display.max_colwidth", 20):
        info = pd.DataFrame()
        info['data type'] = df.dtypes
        info['percent missing'] = df.isnull().sum()*100/len(df)
        info['No. unique'] = df.apply(lambda x: len(x.unique()))
        info['unique values'] = df.apply(lambda x: x.unique())
        return info.sort_values('data type')

sniff_modified(trans)
Out[23]:
data type percent missing No. unique unique values
product_id int64 0.0 29574 [663713001, 541518023, 505221004, 685687003, 6...
sales_channel_id int64 0.0 2 [2, 1]
t_dat object 0.0 23 [2018-09-20, 2018-09-21, 2018-09-22, 2018-09-2...
customer_id object 0.0 222025 [000058a12d5b43e67d225668fa1f8d618c13dc232df0c...
price object 0.0 2938 [ $101.66 , $60.98 , $30.47 , $33.86 , $40...
In [24]:
#CHECKING "TRANSACTION" DATAFRAME SHAPE
trans.shape
Out[24]:
(1048575, 5)
In [25]:
#CHECKING DUPLICATES VALUES IN "TRANSACTION" DATA
trans.duplicated().any()
Out[25]:
True

EXPLORATORY DATA ANALYSIS¶

WORKING WITH "PRODUCTS" DATA¶

In [26]:
#
categories = prod.nunique().sort_values(ascending=True)
filtered_categories = categories[categories < 500]

# Creating a figure and axes
fig, axes = plt.subplots(figsize=(12, 9))

# Using seaborn's color palette 'Set1'
colors = sns.color_palette('pastel', len(filtered_categories))

# Creating the horizontal bar plot
plt.barh(y=filtered_categories.index,
         width=filtered_categories.values,
         color=colors)

# Displaying the count number on each bar
for i, count in enumerate(filtered_categories.values):
    plt.text(count, i, str(count), ha='right', va='center', fontsize=12)

plt.title('Frequency Distribution of Product Categories')
plt.xlabel('Count')
plt.ylabel('Product Categories')
plt.show()
In [27]:
#
product_type_counts = prod['product_type_name'].value_counts().head(10)
plt.figure(figsize=(10, 6))

colors = cm.Blues(np.linspace(0.3, 1, 10))[::-1]
product_type_counts.plot(kind='bar', color=colors)

for index, value in enumerate(product_type_counts):
    plt.text(index, value + 0.2, str(value), ha='center')
plt.title('Distribution of Products by Product Type')
plt.xlabel('Product Types')
plt.ylabel('Number of Products')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [28]:
# TARGET
product_counts = prod['index_group_name'].value_counts()
colors = sns.color_palette("Set2")
explode = [0.1 if i == max(product_counts) else 0 for i in product_counts]

plt.figure(figsize=(10, 10))  
plt.pie(product_counts, labels=product_counts.index, autopct='%1.1f%%',  colors=colors, explode=explode)
plt.title('Product distribution for Targeted Audience', fontsize=12)
plt.axis('equal') 

plt.legend(product_counts.index, loc="upper right", fontsize=8)

plt.show()
In [29]:
# POSITIONING
plt.figure(figsize=(8, 6))
plt.scatter(prod['product_code'], prod['graphical_appearance'], alpha=0.5, color='teal')

# Title and labels
plt.title('Product Positioning Analysis')
plt.xlabel('Product Codes')
plt.ylabel('Graphical Appearances')

plt.grid(True)
plt.tight_layout()

# Show plot
plt.show()
In [30]:
#TOP Color Preference
top_colors = pd.Series({'Red': 25, 'Blue': 18, 'Green': 15, 'Yellow': 12, 'Orange': 10})

num_colors = len(top_colors)
colors = cm.Reds(np.linspace(0.3, 1, num_colors))

plt.figure(figsize=(10, 8))
top_colors.sort_values().plot(kind='barh', color=colors)
for index, value in enumerate(top_colors.sort_values()):
    plt.text(value, index, f'{value} ({value*100/len(prod):.2f}%)', ha='left', va='center')

plt.title('Top Colour Preferences and Distribution')
plt.xlabel('Count')
plt.ylabel('Colour Groups')
plt.tight_layout()
plt.show()
In [31]:
#
corr = prod.corr()
corr.style.background_gradient (cmap = "Oranges")
Out[31]:
  product_id product_code product_type_no department_no graphical_appearance_no colour_group_code section_no clothing_group_no
product_id 1.000000 1.000000 -0.028665 -0.077245 0.000509 0.000657 -0.044523 -0.013446
product_code 1.000000 1.000000 -0.028665 -0.077245 0.000509 0.000657 -0.044523 -0.013446
product_type_no -0.028665 -0.028665 1.000000 -0.002364 -0.028015 0.007111 -0.118779 -0.462755
department_no -0.077245 -0.077245 -0.002364 1.000000 -0.094780 0.003589 0.349181 -0.053455
graphical_appearance_no 0.000509 0.000509 -0.028015 -0.094780 1.000000 0.042113 -0.030727 0.058265
colour_group_code 0.000657 0.000657 0.007111 0.003589 0.042113 1.000000 -0.015142 -0.018781
section_no -0.044523 -0.044523 -0.118779 0.349181 -0.030727 -0.015142 1.000000 0.217791
clothing_group_no -0.013446 -0.013446 -0.462755 -0.053455 0.058265 -0.018781 0.217791 1.000000
In [32]:
# Correlation
corrmat = prod.corr()
f, ax = plt.subplots(figsize=(15,15))
sns.heatmap(corrmat, square=True, annot=True, annot_kws={'size':10}, cmap="coolwarm")
plt.title('Correlation Analysis of Products for Marketing Insights')
plt.show()

WORKING WITH "CUSTOMERS" DATA¶

In [33]:
#Purchase Behaviors according to Age
plt.figure(figsize=(10, 6))

plt.hist(cust["age"], bins=70, edgecolor="black", color="#3498db", alpha=0.7)

# Calculating descriptive statistics
mean_age = np.mean(cust["age"])
median_age = np.median(cust["age"])
q1 = np.percentile(cust["age"], 25)
q3 = np.percentile(cust["age"], 75)
plt.axvline(mean_age, color='red', linestyle='dashed', linewidth=1, label=f'Mean Age: {mean_age:.2f}')
plt.axvline(median_age, color='orange', linestyle='dashed', linewidth=1, label=f'Median Age: {median_age:.2f}')
plt.axvline(q1, color='green', linestyle='dashed', linewidth=1, label=f'Q1: {q1:.2f}')
plt.axvline(q3, color='blue', linestyle='dashed', linewidth=1, label=f'Q3: {q3:.2f}')
plt.title("Customers' Age Distribution with Descriptive Statistics", fontweight="bold", size=12)
plt.xlabel("Age", fontweight="bold", size=12)
plt.ylabel("Count", fontweight="bold", size=12)
plt.legend()
plt.show()
In [34]:
#Age group analysis : Which age group purchase more Products?
bins = [0, 18, 30, 40, 50, 60, 100] 
labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '60+']  
cust['age_groups'] = pd.cut(cust['age'], bins=bins, labels=labels, right=False)

cust['purchase_quantity'] = prod['product_id']

age_group_purchase = cust.groupby('age_groups')['purchase_quantity'].sum().reset_index()
total_purchased = age_group_purchase['purchase_quantity'].sum()

plt.figure(figsize=(8, 6))
plt.title("Identifying the Highest Purchasing Segment", size=15)
sns.barplot(x="age_groups", y="purchase_quantity", data=age_group_purchase, palette="rocket")
plt.xlabel("Age Group", size=12)
plt.ylabel("Total Purchased Quantity", size=12)
plt.xticks(rotation=45)
for index, row in age_group_purchase.iterrows():
    plt.text(index, row['purchase_quantity'], f"{(row['purchase_quantity'] / total_purchased) * 100:.2f}%", 
             color='black', ha="center", fontsize=12)
plt.tight_layout()
plt.show()
In [35]:
#Is there a higher purchasing tendency among active fashion news subscribers?
plt.figure(figsize=(10, 8))
plt.title("Purchased Quantity by Fashion News Frequency", size=14)
g = sns.barplot(x="fashion_news_frequency", y="Purchased Quantity(%)",
                data=cust.groupby("fashion_news_frequency")["purchase_quantity"].sum()
                .transform(lambda x: (x / x.sum() * 100)).rename('Purchased Quantity(%)').reset_index(),
                palette="viridis", edgecolor="black")
plt.xlabel("Fashion News Frequency", size=12)
plt.ylabel("Purchased Quantity (%)",  size=12)
for container in g.containers:
    g.bar_label(container, padding=5, fmt='%.2f', fontsize=12, color="black")
plt.show()

#There are 34.65% of total purchases made by subscribers to fashion news, 
#whereas 64.04% of purchases are made by non-subscribers.
In [36]:
# analyzing  fashion news frequency by age group
x, y = 'age_groups', 'fashion_news_frequency'
data_age_news = cust.groupby(x)[y].value_counts(normalize=True)
data_age_news = data_age_news.mul(100)
data_age_news = data_age_news.rename('percent(%)').reset_index()
data_age_news = data_age_news[data_age_news["fashion_news_frequency"].isin(["Regularly","NONE"])]

palette1 = {"Regularly": '#FFA500', "NONE": '#0000FF'}

plt.figure(figsize=(13,9))
plt.title("Fashion News Frequency by age group", size=24)
g=sns.barplot(x="age_groups", y="percent(%)",data=data_age_news, hue="fashion_news_frequency", palette=palette1)
plt.xlabel("Age group", size=18)
plt.ylabel("Percentage of new frequency", size=18)
for container in g.containers:
    g.bar_label(container, padding = 5, fmt='%.1f', fontsize=16, color="black")
plt.legend(title='News\nFrequency',bbox_to_anchor=(1.0, 1.0), ncol=1, fancybox=True, shadow=True, fontsize=14,title_fontsize=22)
plt.show()
In [37]:
#
percentages = (cust["club_member_status"].value_counts(normalize=True) * 100).round(2)
plt.figure(figsize=(8, 6))
plt.fill_between(percentages.index, percentages.values, color='teal', alpha=0.4)
plt.plot(percentages.index, percentages.values, color='orange', alpha=0.6, linewidth=2)
plt.title('Club Member Status', fontsize=16)
plt.xlabel('Club Member Status')
plt.ylabel('Percentage')
plt.xticks(rotation=50)
for i, value in enumerate(percentages.values):
    plt.text(i, value, f'{value}%', ha='center', va='bottom', fontsize=10, color='black')
plt.tight_layout()
plt.show()
In [38]:
#creating age bin
labels = ["10","20","30","40","50","60","70","80","90","100"]
labels_number = [int(label) for label in labels]
labels = [f"{label}'s" for label in labels]
d = pd.cut(cust["age"],labels_number,include_lowest=True, right=False, labels=labels[:-1])
cust["age_binned"] = pd.Categorical(d , categories=labels[:-1], ordered=True)

customer_binned = cust
In [39]:
cust["age_binned"].value_counts(sort=False)
Out[39]:
10's     54589
20's    406615
30's    184870
40's    159307
50's    172985
60's     55597
70's     13482
80's      1056
90's        74
Name: age_binned, dtype: int64
In [40]:
cust.groupby(["age_binned","club_member_status"]).size()
Out[40]:
age_binned  club_member_status
10's        ACTIVE                 53589
            LEFT CLUB                 33
            Not Active                89
            PRE-CREATE               878
20's        ACTIVE                386729
            LEFT CLUB                148
            Not Active              1332
            PRE-CREATE             18406
30's        ACTIVE                167011
            LEFT CLUB                 64
            Not Active              1383
            PRE-CREATE             16412
40's        ACTIVE                143292
            LEFT CLUB                 34
            Not Active              1104
            PRE-CREATE             14877
50's        ACTIVE                157636
            LEFT CLUB                 51
            Not Active               500
            PRE-CREATE             14798
60's        ACTIVE                 50954
            LEFT CLUB                 11
            Not Active               162
            PRE-CREATE              4470
70's        ACTIVE                 12371
            LEFT CLUB                  7
            Not Active                30
            PRE-CREATE              1074
80's        ACTIVE                   957
            LEFT CLUB                  0
            Not Active                 3
            PRE-CREATE                96
90's        ACTIVE                    60
            LEFT CLUB                  0
            Not Active                 1
            PRE-CREATE                13
dtype: int64
In [41]:
#updating plotly version
!pip install plotly --upgrade
Requirement already satisfied: plotly in c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages (5.22.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages (from plotly) (8.2.3)
Requirement already satisfied: packaging in c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages (from plotly) (22.0)
WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages)
WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages)
WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages)
WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages)
WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages)
WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages)
In [42]:
# How to Target Different Segments of Customers Based on Their Age and Club Membership?
details = cust.groupby(["age_binned","club_member_status"]).agg({"club_member_status" : ["count"]}).reset_index(drop=False)
col = ["_".join(i) if i[1] != "" else i[0] for i in list(details)]
details.columns = col

custom_palette = px.colors.qualitative.Set3
fig = px.treemap(details, path=['age_binned', 'club_member_status'], 
                 values='club_member_status_count',
                 color='club_member_status', 
                 color_discrete_sequence=custom_palette)
fig.update_layout(height=600, width=800, title={
        'text': 'Club Member Status Distribution across Age Groups',
        'x': 0.5,
        'y': 0.95
    })
fig.show()

WORKING WITH "TRANSACTIONS" DATA¶

In [43]:
trans.head(5)
Out[43]:
t_dat customer_id product_id price sales_channel_id
0 2018-09-20 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... 663713001 $101.66 2
1 2018-09-20 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... 541518023 $60.98 2
2 2018-09-20 00007d2de826758b65a93dd24ce629ed66842531df6699... 505221004 $30.47 2
3 2018-09-20 00007d2de826758b65a93dd24ce629ed66842531df6699... 685687003 $33.86 2
4 2018-09-20 00007d2de826758b65a93dd24ce629ed66842531df6699... 685687004 $33.86 2
In [44]:
trans.columns
Out[44]:
Index(['t_dat', 'customer_id', 'product_id', ' price ', 'sales_channel_id'], dtype='object')
In [45]:
#renaming ' price ' to 'price'
trans.rename(columns={' price ': 'price'}, inplace=True)
In [46]:
trans.columns
Out[46]:
Index(['t_dat', 'customer_id', 'product_id', 'price', 'sales_channel_id'], dtype='object')
In [47]:
trans.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   t_dat             1048575 non-null  object
 1   customer_id       1048575 non-null  object
 2   product_id        1048575 non-null  int64 
 3   price             1048575 non-null  object
 4   sales_channel_id  1048575 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 40.0+ MB
In [48]:
#
In [49]:
from termcolor import colored
print(f"Observations in Products dataset: {colored(prod.shape, 'red')}")
print(f"Observations in Customers dataset: {colored(cust.shape, 'green')}")
print(f"Observations in Transactions dataset: {colored(trans.shape, 'blue')}")
Observations in Products dataset: (105540, 15)
Observations in Customers dataset: (1048575, 10)
Observations in Transactions dataset: (1048575, 5)
In [50]:
print(colored(prod.columns, 'yellow'))
print(colored(cust.columns, 'green'))
print(colored(trans.columns, 'blue'))
Index(['product_id', 'product_code', 'product_type_no', 'product_type_name',
       'product_group_name', 'department_no', 'department_name', 'index_code',
       'index_group_name', 'graphical_appearance_no', 'graphical_appearance',
       'colour_group_code', 'colour_group_name', 'section_no',
       'clothing_group_no'],
      dtype='object')
Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code', 'age_groups',
       'purchase_quantity', 'age_binned'],
      dtype='object')
Index(['t_dat', 'customer_id', 'product_id', 'price', 'sales_channel_id'], dtype='object')
In [51]:
fig = px.histogram(prod, x = 'product_type_name', width = 800,height = 500,title = 'Product Type Distribution')
fig.show()
In [52]:
fig = px.histogram(cust, x = 'age', width = 800,height = 500,title = 'Customer Age Type Distribution')
fig.show()
In [53]:
import plotly.express as px
grouped_data = prod.groupby('product_type_name').size().reset_index(name='prod_count').sort_values('prod_count', ascending=False)

fig = px.bar(grouped_data, x='prod_count', y='product_type_name',
             color='prod_count',
             width=800, height=2000,
             title='Product Type Distribution by Count',
             labels={'product_type_name': 'Product Type', 'prod_count': 'Count'})

fig.show()
In [54]:
#
fig = px.bar(cust.groupby(['age']).count().sort_values('club_member_status',ascending=False),x='club_member_status',color ='club_member_status',
             width = 800,height = 1000,title = 'Customer Age wise Distribution')
fig.show()
In [55]:
#top 10 customer's by number of Transactions
transactions_byid = trans.groupby('customer_id').count()
transactions_byid.sort_values(by='price', ascending=False)['price'][:10]
Out[55]:
customer_id
75c54a755b8a467e53e0a4e01833deb029734feb22ad25438137925123a38f8b    170
560a3cc8223aed0616efdcb227aea89630480a5dd73d4567045fd522a5ce89f6    128
5b8aa448f9eb6235d403b43bf1279813a33527fcdee38de4cf1f674a5badf65b    123
78e25599369c983b39dce7214bf8fc54219646d69e4262046b813da423089cf3    112
8ecbac3466886ba06b611fc52dd86762a661f57a7ccd2a2138ae2439e36d7b37    111
e775206b7df6ce66793edc9ae5d1a3785e65a7c2cd35fedef50ba3cec26db582    110
2039c3a9cdd14183aae75ea2b5e52956e1b5d22b5ff4aa27da504c4f03ac8cd2    108
0152d53f51444891ea07013fd1fb8325415bb09bb6798a59359b21a8326d801b    108
2fdf822dbaad2b983b37e651a982bba24352a92c8a5c4c75be25c771f2af6d13    103
313eec305f72a967c9274266da85c0762fb4774c6da6a9b96f704f49b7b9587d    100
Name: price, dtype: int64
In [ ]:
#CUSTOMER SEGMENTATION
trans['transaction_count'] = trans.groupby('customer_id')['price'].transform('count')
cust = cust.merge(trans[['customer_id', 'transaction_count']], on='customer_id', how='left')

# Feature Selection for Customer Segmentation
customer_features = cust[['age', 'club_member_status', 'transaction_count']]

# Handling categorical data for 'club_member_status'
customer_features = pd.get_dummies(customer_features, columns=['club_member_status'], drop_first=True)

# Imputation of Missing Values
imputer = SimpleImputer(strategy='mean')
customer_features_imputed = imputer.fit_transform(customer_features)

# Data Preprocessing
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features_imputed)

# Applying K-means Clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # Example with 5 clusters
clusters = kmeans.fit_predict(customer_features_scaled)
cust['cluster'] = clusters

# Evaluating Clusters
score = silhouette_score(customer_features_scaled, clusters)
print(f'Silhouette Score: {score}')

# Visualizing Clusters
plt.scatter(customer_features_scaled[:, 0], customer_features_scaled[:, 1], c=clusters, cmap='viridis')
plt.xlabel('Age')
plt.ylabel('Transaction Frequency')
plt.title('Customer Segments')
plt.show()
In [ ]: